Purpose: determine to what extent the current data can accurately describe the correlations and underlying factors of the score. In particular, concerning the answerTemporalities[0] groups: are there underlying groups that explain the discrepancies in score? Are those groups tied to certain questions?
In [ ]:
%run "../Functions/2.1 Sampling.ipynb"
Purpose: find out which questions have the most weight in the computation of the score.
Other leads: LDA, ANOVA.
Source for PCA: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
In [ ]:
# Selection of the sample on which the analysis is run.
# Exactly one `gfdf = ...` assignment is active; the commented-out lines are
# alternative samples.
# all
#gfdf = gform.copy()
# only pairs
#gfdf = getPerfectPretestPostestPairs(gform)
# in the pairs, only volunteers
#gfdf = gfdf[~gfdf[QVolunteer].isin(yesNoPositives)]
# playtest's perfect pairs of phase 1
#gfdf = gfdfPlaytestPhase1PretestPosttestUniqueProfiles.copy()
# only the volunteers of this sample
gfdf = gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
# only the pretests
# NOTE: pretests and posttests are extracted before gfdf is renumbered below,
# so they keep the original row labels.
pretests = gfdf[gfdf[QTemporality] == answerTemporalities[0]]
#gfdf = pretests
# only the posttests
posttests = gfdf[gfdf[QTemporality] == answerTemporalities[1]]
#gfdf = posttests
# when True, later cells put pretest and posttest columns side by side
pretestPosttestConcatenation = False
# when True, a later cell saves one figure per question
saveFiles = False
# renumber the rows from 0 to len(gfdf) - 1
gfdf.index = range(0, len(gfdf))
len(gfdf)
In [ ]:
# Size of the sample: number of pretests, number of posttests, total.
# An expression nested in an `if` block is never echoed by the notebook,
# so the counts are printed explicitly.
if not pretestPosttestConcatenation:
    print(
        len(gfdf[gfdf[QTemporality] == answerTemporalities[0]]),
        len(gfdf[gfdf[QTemporality] == answerTemporalities[1]]),
        len(gfdf),
    )
In [ ]:
# Binarize the answers.
# In concatenation mode, pretests and posttests are first ordered by user id
# and renumbered from 0, so that row i of both tables can be put side by side.
if pretestPosttestConcatenation:
    pretests = pretests.sort_values(by=QUserId).reset_index(drop=True)
    posttests = posttests.sort_values(by=QUserId).reset_index(drop=True)

    pretestsbinarized = getAllBinarized(pretests)
    pretestsbinarized.index = pretests.index

    posttestsbinarized = getAllBinarized(posttests)
    posttestsbinarized.index = posttests.index
else:
    binarized = getAllBinarized(gfdf)
    binarized.index = gfdf.index
In [ ]:
# In concatenation mode, prefix every column with its temporality and put the
# pretest and posttest tables side by side (column-wise concatenation).
if pretestPosttestConcatenation:
    pretestQPrefix = "pretest_"
    posttestQPrefix = "posttest_"

    pretestsbinarized = pretestsbinarized.add_prefix(pretestQPrefix)
    pretests = pretests.add_prefix(pretestQPrefix)

    posttestsbinarized = posttestsbinarized.add_prefix(posttestQPrefix)
    posttests = posttests.add_prefix(posttestQPrefix)

    binarized = pd.concat([pretestsbinarized, posttestsbinarized], axis=1)
    gfdf = pd.concat([pretests, posttests], axis=1)
len(binarized)
In [ ]:
# dimensions of the raw answers and of their binarized counterpart
gfdf.shape, binarized.shape
In [ ]:
# score = row sum of the binarized answers, computed as the dot product of
# each row with a vector of ones
if pretestPosttestConcatenation:
scorePretest = np.dot(pretestsbinarized,np.ones(len(pretestsbinarized.columns)))
scorePosttest = np.dot(posttestsbinarized,np.ones(len(posttestsbinarized.columns)))
scoreTotal = scorePretest + scorePosttest
score = scorePretest
else:
score = np.dot(binarized,np.ones(len(binarized.columns)))
In [ ]:
# number of columns of the binarized table
# NOTE: must be read before the 'class' column is appended in the next cell;
# re-running this cell afterwards would count 'class' as well.
dimensions = binarized.shape[1]
dimensions
In [ ]:
# placeholder label column, appended after the `dimensions` data columns
binarized['class'] = 'default'
In [ ]:
# split data table into data X and class labels y
X = binarized.iloc[:,0:dimensions].values
y = binarized.iloc[:,dimensions].values
In [ ]:
# standardize every column of X
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
In [ ]:
# covariance matrix computed by hand from the centered data
mean_vec = np.mean(X_std, axis=0)
cov_mat = (X_std - mean_vec).T.dot((X_std - mean_vec)) / (X_std.shape[0]-1)
print('Covariance matrix \n%s' %cov_mat)
In [ ]:
# same matrix computed by NumPy, for comparison with the previous output
print('NumPy covariance matrix: \n%s' %np.cov(X_std.T))
In [ ]:
# eigendecomposition of the covariance matrix
# NOTE: np.linalg.eig returns the eigenvectors as the columns of eig_vecs and
# does not sort the eigenvalues.
cov_mat = np.cov(X_std.T)
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)
In [ ]:
# eigendecomposition of the correlation matrix
# NOTE(review): when the correlation matrix contains no NaN, eig_vals and
# eig_vecs computed above from the covariance matrix are overwritten here.
cor_mat1 = np.corrcoef(X_std.T)
if not pd.isnull(cor_mat1).any():
eig_vals, eig_vecs = np.linalg.eig(cor_mat1)
#print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)
In [ ]:
# singular value decomposition of the transposed standardized data
u,s,v = np.linalg.svd(X_std.T)
In [ ]:
# singular values
s
In [ ]:
# Check that every eigenvector has unit length.
# np.linalg.eig returns the eigenvectors as the COLUMNS of eig_vecs, so the
# matrix is transposed before iterating (iterating eig_vecs directly would
# walk over its rows).
for ev in eig_vecs.T:
    np.testing.assert_array_almost_equal(1.0, np.linalg.norm(ev))
print('Everything ok!')
In [ ]:
# Make a list of (eigenvalue magnitude, eigenvector) tuples; eigenvector i is
# column i of eig_vecs, i.e. row i of its transpose.
eig_pairs = [
    (magnitude, list(vector))
    for magnitude, vector in zip(np.abs(eig_vals), eig_vecs.T)
]

# Sort the tuples from high to low on the eigenvalue only: a plain tuple sort
# would fall back to comparing the eigenvectors whenever two eigenvalues are
# equal, which is meaningless (and fails for complex components).
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

# Visually confirm that the list is correctly sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for eigenvalue, _ in eig_pairs:
    print(eigenvalue)
In [ ]:
# Disabled debugging cell: inspects the structure of eig_pairs.
# NOTE(review): the last line references saved_eig_pairs, whose assignment is
# commented out below; enabling this cell as is would raise a NameError.
if False:
#saved_eig_pairs = eig_pairs.copy()
np.array([len(x) for x in eig_pairs])
np.array([len(x[1]) for x in eig_pairs])
np.array([type(x[1]) for x in eig_pairs])
#np.array([len(x) for x in saved_eig_pairs])
#np.array([len(x[1]) for x in saved_eig_pairs])
#np.array([type(x[1]) for x in saved_eig_pairs])
#saved_eig_pairs[0]
eig_pairs[0]
np.array([pd.isnull(x[1]).any() for x in saved_eig_pairs]).any(),np.array([pd.isnull(x[1]).any() for x in eig_pairs]).any()
In [ ]:
# Percentage of the total variance carried by each component, largest first,
# and its running total.
tot = sum(eig_vals)
descendingEigenvalues = sorted(eig_vals, reverse=True)
var_exp = [(eigenvalue / tot)*100 for eigenvalue in descendingEigenvalues]
cum_var_exp = np.cumsum(var_exp)

# Bars: individual share of each component; steps: cumulative share.
with plt.style.context('seaborn-whitegrid'):
    plt.figure(figsize=(6, 4))
    componentPositions = range(dimensions)
    plt.bar(
        componentPositions,
        var_exp,
        alpha=0.5,
        align='center',
        label='individual explained variance',
    )
    plt.step(
        componentPositions,
        cum_var_exp,
        where='mid',
        label='cumulative explained variance',
    )
    plt.ylabel('Explained variance ratio')
    plt.xlabel('Principal components')
    plt.legend(loc='best')
    plt.tight_layout()
In [ ]:
# first 5 values of var_exp (percentages, sorted in decreasing order)
var_exp[:5]
In [ ]:
# first 5 values of the cumulative sum of var_exp
cum_var_exp[:5]
In [ ]:
# Projection matrix: the eigenvectors of the two largest eigenvalues, stored
# as the two columns of a (dimensions x 2) matrix.
firstEigenvector = eig_pairs[0][1]
secondEigenvector = eig_pairs[1][1]
matrix_w = np.column_stack((firstEigenvector, secondEigenvector))
print('Matrix W:\n', matrix_w)
In [ ]:
# default palette used by the plotting functions below when the number of
# classes does not exceed its length
basecolors = ('green','red','blue','magenta','cyan','purple','yellow','black','white')
colors = basecolors
len(colors)
In [ ]:
# projection of the standardized data on the two columns of matrix_w
Y = X_std.dot(matrix_w)
In [ ]:
# scatter plot of the projected answers, without any class colouring
with plt.style.context('seaborn-whitegrid'):
plt.figure(figsize=(6, 4))
ax = plt.subplot(111)
plt.scatter(Y[:, 0], Y[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title("base PCA")
plt.show()
In [ ]:
def classifyAndPlot(answerIndices, Y, classNames, classes, title = '', rainbow = False, figsize = (12, 8)):
    """Create a scatter plot using a different colour for each class.

    A slider controlling the transparency of the points is displayed above
    the figure.

    Parameters
    ----------
    answerIndices : index of the 'gfdf' and 'binarized' DataFrames; one
        entry per plotted answer.
    Y : 2D position in PCA for answers; column 0 is used as x, column 1 as y.
    classNames : list of class names. It may hold one more name than
        `classes`; that last name is then given to the answers that belong
        to none of the classes. Otherwise those answers are named 'other'.
        The list passed by the caller is not modified.
    classes : list of index collections; classes[i] holds the labels (taken
        from `answerIndices`) of the answers of class classNames[i].
    title : str, title of the figure (no title when empty).
    rainbow : whether to use rainbow colors; they are also used when there
        are more classes than base colors.
    figsize : for matplotlib.

    Returns
    -------
    pandas Series indexed by class name, holding the scatter artist of each
    class.
    """
    # plain-Python form of the `%matplotlib nbagg` line magic
    get_ipython().run_line_magic('matplotlib', 'nbagg')

    # work on a copy: a default class name may be appended below and the
    # caller's list must not be altered
    classNames = list(classNames)

    # sets the name of the default class, needed only when the provided
    # classes do not cover every answer
    defaultClassName = ''
    sampleSize = sum(len(classMembers) for classMembers in classes)
    if sampleSize < len(answerIndices):
        if len(classNames) == len(classes) + 1:
            defaultClassName = classNames[-1]
        else:
            defaultClassName = 'other'
            classNames.append(defaultClassName)

    # y is the 'class' container: every answer starts in the default class
    y = pd.Series(index = answerIndices, data = defaultClassName)
    # set the class of each answer
    for classIndex, classMembers in enumerate(classes):
        y[classMembers] = classNames[classIndex]

    if (defaultClassName in y.values) and (defaultClassName not in classNames):
        print("unexpected error: check the exhaustiveness of the provided classes")

    # the palette is chosen before the slider callback is defined, so that
    # the callback never reads an unbound name
    colors = basecolors
    if rainbow or len(classNames) > len(colors):
        colors = plt.cm.rainbow(np.linspace(1, 0, len(classNames)))
    colors = colors[:len(classNames)]

    with plt.style.context('seaborn-whitegrid'):
        # explicit dtype: the series stores matplotlib artists
        plots = pd.Series(dtype=object)

        # update function to control the alpha channel
        def updateAlpha(alpha):
            # nothing to update until the scatter plots exist
            if len(plots) > 0:
                for lab in classNames:
                    plots.loc[lab].set_alpha(alpha)
                # empty scatters used only to get legend markers drawn with
                # the same alpha as the points
                proxyArtists = [
                    plt.scatter([], [], label=lab, c=[col], alpha=alpha, marker='o', s=150)
                    for lab, col in zip(classNames, colors)
                ]
                plots.loc[classNames[0]].axes.legend(
                    proxyArtists, classNames, loc='center left', bbox_to_anchor=(1, 0.5))
                plt.show()

        # creates the slider to control the alpha channel
        interact(updateAlpha, alpha=(0.0,1.0,0.01))

        plt.figure(figsize=figsize)
        ax = plt.subplot(111)

        for lab, col in zip(classNames, colors):
            # boolean selector of the answers of class lab:
            # Y[classMask, 0] selects all their x, Y[classMask, 1] all their y
            classMask = (y == lab).values
            plots.loc[lab] = plt.scatter(
                Y[classMask, 0],
                Y[classMask, 1],
                label=lab,
                c=[col],
                alpha=0.2,
                s=150,
            )

        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')

        # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
        # Put a legend to the right of the current axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        if len(title) > 0:
            plt.title(title)

        plt.show()

    return plots
In [ ]:
def updateQuestionIndex(q):
    """Plot the PCA projection coloured by the answers given to question q.

    q is the position of the question among the columns of the global
    'gfdf'. One class is built per distinct answer (missing values
    included), answers being compared through their string form.
    """
    question = gfdf.columns[q]
    print("question " + str(q) + ": " + question)

    answersAsText = gfdf[question].apply(str)
    distinctAnswers = gfdf[question].value_counts(dropna = False).index

    classNames = [str(answer) for answer in distinctAnswers]
    classes = [gfdf[answersAsText == name].index for name in classNames]

    classifyAndPlot(gfdf.index, Y, classNames, classes, title = question, rainbow = False)
#interact(updateQuestionIndex, q=(0,len(gfdf.columns),1));
In [ ]:
#updateQuestionIndex(q)
In [ ]:
# Module-level state shared by classPreprocess, commonClassProcess,
# plotClasses and complexClassifyAndPlot.
# np.nan is used as the "not created yet" marker; it is tested with
# pd.isnull / pd.notnull in those functions.

# 2D positions of the answers
interactiveY = Y.copy()
# class name of each answer
interactivey = []
# the list of unique colors used
interactiveColors = []
# names of the classes and, for each class, the index of its answers
interactiveGraphClassNames = []
interactiveGraphClasses = []
# scatter artist, figure and axes of the interactive graph
interactiveGraphPlots = np.nan
interactiveFigure = np.nan
interactiveGraphAx = np.nan
# slider widgets: selected question and alpha channel
questionInteractive = np.nan
alphaInteractive = np.nan
# title of the interactive graph
interactiveTitle = ''
In [ ]:
# In concatenation mode, build the table of possible answers matching the
# prefixed pretest and posttest columns: two copies of possibleAnswers,
# re-labelled with the pretest and posttest column names, stacked vertically.
if pretestPosttestConcatenation:
    pretestPossibleAnswers, posttestPossibleAnswers = (
        possibleAnswers.copy(),
        possibleAnswers.copy(),
    )
    pretestPossibleAnswers.index = pretests.columns
    posttestPossibleAnswers.index = posttests.columns
    possibleAnswersConcat = pd.concat(
        [pretestPossibleAnswers, posttestPossibleAnswers],
        axis = 0,
    )
In [ ]:
def classPreprocess(gfdf, question, answersToCheckAgainst = possibleAnswers):
    """Build the interactive classes from the answers given to a question.

    Sets the globals interactiveGraphClassNames and interactiveGraphClasses.
    The class names are the known possible answers of the question when
    answersToCheckAgainst provides some; otherwise they are the distinct
    answers found in gfdf, as sorted strings. Each class is the index of the
    rows of gfdf whose answer, converted to a string, equals the class name.

    In concatenation mode, possibleAnswersConcat is used instead of the
    answersToCheckAgainst argument.
    """
    global interactiveGraphClassNames, interactiveGraphClasses

    if pretestPosttestConcatenation:
        answersToCheckAgainst = possibleAnswersConcat

    interactiveGraphClassNames = []
    interactiveGraphClasses = []

    knownAnswers = answersToCheckAgainst[question]
    if len(knownAnswers) > 0:
        interactiveGraphClassNames = knownAnswers.copy()
    else:
        interactiveGraphClassNames = sorted(str(x) for x in gfdf[question].unique())

    # string form of the answers, computed once for all the classes
    answersAsText = gfdf[question].apply(str)
    interactiveGraphClasses.extend(
        gfdf[answersAsText == answer].index
        for answer in interactiveGraphClassNames
    )
In [ ]:
def commonClassProcess(gfdf):
    """Assign a class name to every row of gfdf.

    Reads the globals interactiveGraphClassNames and interactiveGraphClasses
    and stores the result in the global interactivey, a Series indexed like
    gfdf. When the classes do not cover every row, the remaining rows get a
    default class: the last class name when there is exactly one more name
    than classes, 'other' otherwise (in which case 'other' is appended to
    interactiveGraphClassNames).
    """
    global interactivey
    global interactiveGraphClassNames, interactiveGraphClasses

    # number of rows covered by the provided classes
    coveredRows = sum(len(members) for members in interactiveGraphClasses)

    # sets the name of the default class
    fallbackName = ''
    if coveredRows < len(gfdf.index):
        if len(interactiveGraphClassNames) == len(interactiveGraphClasses) + 1:
            fallbackName = interactiveGraphClassNames[-1]
        else:
            fallbackName = 'other'
            interactiveGraphClassNames.append(fallbackName)

    # every row starts in the default class, then receives its actual class
    interactivey = pd.Series(index = gfdf.index, data = fallbackName)
    for position, members in enumerate(interactiveGraphClasses):
        interactivey[members] = interactiveGraphClassNames[position]

    if fallbackName not in interactiveGraphClassNames and fallbackName in interactivey.values:
        print("unexpected error: check the exhaustiveness of the provided classes")
In [ ]:
def plotClasses(rainbow):
    """Colour the interactive scatter plot according to the current classes.

    Works on the module-level interactive state: creates the scatter plot of
    interactiveY on first use, then recolours its points from interactivey,
    rebuilds the legend on interactiveGraphAx and sets the title when
    interactiveTitle is not empty.

    rainbow: whether to use rainbow colors; they are also used when there
    are more classes than base colors.
    """
    global alphaInteractive
    global interactiveColors
    global interactiveY, interactivey
    global interactiveGraphClassNames
    global interactiveGraphPlots, interactiveGraphAx

    # one colour per class
    classCount = len(interactiveGraphClassNames)
    if rainbow or classCount > len(basecolors):
        palette = plt.cm.rainbow(np.linspace(1, 0, classCount))
    else:
        palette = basecolors
    interactiveColors = palette[:classCount]

    # first call: the points are drawn once, and only recoloured afterwards
    if pd.isnull(interactiveGraphPlots):
        interactiveGraphPlots = plt.scatter(
            interactiveY[:, 0],
            interactiveY[:, 1],
            label='-',
            c='yellow',
            alpha=alphaInteractive.value,
            s=150,
        )
        plt.xlabel('Principal Component 1')
        plt.ylabel('Principal Component 2')

    # fullColors holds the colour of every point; proxyArtists are empty
    # scatters used only as legend markers
    fullColors = interactivey.copy()
    proxyArtists = []
    for lab, col in zip(interactiveGraphClassNames, interactiveColors):
        members = interactivey == lab
        memberIndex = interactivey[members].index
        fullColors[members] = pd.Series(data = [col] * len(memberIndex), index = memberIndex)
        proxyArtists.append(
            plt.scatter([], [], label=lab, c=col, alpha=alphaInteractive.value, s=150)
        )
    interactiveGraphPlots.set_color(fullColors)

    # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
    # Put a legend to the right of the current axis
    interactiveGraphAx.legend(
        proxyArtists,
        interactiveGraphClassNames,
        loc='center left',
        bbox_to_anchor=(1, 0.5),
    )

    if len(interactiveTitle) > 0:
        plt.title(interactiveTitle)
In [ ]:
def complexClassifyAndPlot(
    gfdf,
    Y,
    classNames = None,
    classes = None,
    title = '',
    rainbow = False,
    figsize = (12,8),
    questionIndex=1,
    ):
    """Create an interactive scatter plot using one colour per class.

    When classNames or classes is empty, the plot is fully interactive: a
    slider selects a question of gfdf and the classes are the answers given
    to that question. Otherwise the provided classes are plotted. In both
    cases a second slider controls the transparency of the points.

    The function works through the module-level interactive* variables,
    which it overwrites.

    Parameters
    ----------
    gfdf : base survey answers.
    Y : 2D position in PCA for answers.
    classNames : list of class names (default: none). Not modified.
    classes : list of index collections, one per class (default: none).
        Not modified.
    title : str, title used when classes are provided; in fully interactive
        mode the title is built from the selected question.
    rainbow : whether to use rainbow colors.
    figsize : for matplotlib.
    questionIndex : position, among the columns of gfdf, of the question
        initially selected in fully interactive mode.
    """
    # plain-Python form of the `%matplotlib nbagg` line magic
    get_ipython().run_line_magic('matplotlib', 'nbagg')

    global questionInteractive, alphaInteractive
    global interactiveGraphPlots,\
        interactiveFigure,\
        interactiveGraphAx,\
        interactiveGraphClassNames,\
        interactiveGraphClasses,\
        interactivey
    global interactiveY, interactiveTitle

    interactiveGraphPlots = np.nan
    # copies: commonClassProcess may append a default class name, which must
    # not leak into the caller's list
    interactiveGraphClassNames = [] if classNames is None else list(classNames)
    interactiveGraphClasses = [] if classes is None else list(classes)
    # plotClasses reads the positions and the title from these globals
    interactiveY = Y
    interactiveTitle = title

    fullyInteractive = (len(interactiveGraphClassNames) == 0 or len(interactiveGraphClasses) == 0)

    if fullyInteractive:
        # questions that are skipped by the question slider
        # other candidates: QTimestamp, QAge
        forbiddenQuestions = [QRemarks, QUserId]

        def updateQuestionIndex(question=questionIndex):
            global interactiveTitle
            global interactiveGraphClassNames, interactiveGraphClasses

            # move on to the next allowed question, wrapping around
            chosenQuestion = gfdf.columns[question]
            while chosenQuestion in forbiddenQuestions:
                question = (question + 1) % len(gfdf.columns)
                chosenQuestion = gfdf.columns[question]

            interactiveTitle = "Q" + str(question) + ": '" + chosenQuestion + "'"

            classPreprocess(gfdf, chosenQuestion)
            commonClassProcess(gfdf)

            # the scatter plot does not exist yet during the initial call
            if pd.notnull(interactiveGraphPlots):
                plotClasses(rainbow)
                plt.show()

        # creates the slider to select the question
        questionInteractive = IntSlider(value=questionIndex, min=0, max=len(gfdf.columns)-1, step=1)
        interactive(updateQuestionIndex, question=questionInteractive)
        display(questionInteractive)

    with plt.style.context('seaborn-whitegrid'):
        defaultAlphaValue = 0.5

        # update function to control the alpha channel
        def updateAlpha(alpha = defaultAlphaValue):
            global interactiveColors
            global interactiveGraphPlots

            if pd.notnull(interactiveGraphPlots):
                interactiveGraphPlots.set_alpha(alpha)

                # empty scatters used only to get legend markers drawn with
                # the same alpha as the points
                proxyArtists = [
                    plt.scatter([], [], label=lab, c=[col], alpha=alpha, s=150)
                    for lab, col in zip(interactiveGraphClassNames, interactiveColors)
                ]

                # source https://stackoverflow.com/questions/4700614/how-to-put-the-legend-out-of-the-plot
                # Put a legend to the right of the current axis
                interactiveGraphAx.legend(
                    proxyArtists,
                    interactiveGraphClassNames,
                    loc='center left',
                    bbox_to_anchor=(1, 0.5),
                )
                plt.show()

        # creates the slider to control the alpha channel
        alphaInteractive = FloatSlider(value=defaultAlphaValue, min=0.0, max=1.0, step=0.01)
        interactive(updateAlpha, alpha=alphaInteractive)
        display(alphaInteractive)

        interactiveFigure = plt.figure(figsize=figsize)
        interactiveGraphAx = plt.subplot(111)

        # compute the classes, then draw them
        if fullyInteractive:
            updateQuestionIndex(questionIndex)
        else:
            commonClassProcess(gfdf)
        plotClasses(rainbow)
In [ ]:
#gform.loc[:, ['Name: Plasmid', 'Function: TER', 'Name: PR', 'Function - game: CDS', 'Name: TER', 'Function - biology: CDS', 'Name: RBS', 'Example: CDS', 'Name: CDS', 'Function: PR', 'Function: RBS', 'Function: Plasmid', 'Name: Operator XXX']]
In [ ]:
#complexClassifyAndPlot(gfdf, Y, rainbow=True, figsize = (15, 5), questionIndex=12);
In [ ]:
## pb = 1 color with 4 subvalues not accepted to initialize n-indexed series
#fullColors = interactivey.copy()
#for lab, col in zip(interactiveGraphClassNames,interactiveColors):
# fullColors[interactivey == lab] = pd.Series(data = [col] * len(interactivey[interactivey == lab]), index = interactivey[interactivey == lab].index)
In [ ]:
# Fully interactive plot (no predefined classes), rainbow colours; the other
# arguments keep their default values.
complexClassifyAndPlot(gfdf, Y, classNames=[], classes=[], rainbow=True)
In [ ]:
# Saves one figure per question of gfdf, named after the position and the
# text of the question.
if saveFiles:
#if True:
    import time

    for qIndex, questionText in enumerate(gfdf.columns):
        complexClassifyAndPlot(gfdf, Y, rainbow=True, figsize = (15, 5), questionIndex=qIndex)
        # short pauses around the backend switch, before saving the figure
        time.sleep(0.3)
        get_ipython().run_line_magic('matplotlib', 'nbagg')
        time.sleep(0.1)

        questionTitle = "Q" + str(qIndex) + "_'" + questionText.replace(" ", "_").replace(":", "") + "'"
        try:
            interactiveFigure.savefig(questionTitle)
        except Exception as error:
            # best effort: report the failure with its cause and go on with
            # the next question; interrupting the kernel is not swallowed
            print("- savefig failed for " + questionTitle + ": " + str(error))
In [ ]:
# NOTE(review): the only assignment of sortedScore visible in this notebook
# is in the next cell; on a fresh kernel, run that cell first.
sortedScore
In [ ]:
# In concatenation mode, the score compared with the PCA component is the
# difference between posttest and pretest scores.
if pretestPosttestConcatenation:
    score = scorePosttest - scorePretest

# first coordinate of each answer in the PCA plane, and its score
pcaComponent1 = interactiveY[:, 0].copy()
sortedScore = score.copy()

fig = plt.figure(figsize=(12,8))

# left panel: both series against the position of the answer
ax = plt.subplot(121)
pcaPositions = range(0, len(pcaComponent1))
scorePositions = range(0, len(sortedScore))
pcaScat = plt.scatter(pcaPositions, pcaComponent1, c= 'blue', alpha=0.7)
scoreScat = plt.scatter(scorePositions, sortedScore, c='red', alpha=0.7)
ax.legend([pcaScat, scoreScat], ['pca', 'score'], loc='center left')
plt.title("Comparison of score with the value of PCA component 1")
plt.plot()

# right panel: score against the PCA component
ax2 = plt.subplot(122)
scorePcaScat = plt.scatter(pcaComponent1, sortedScore, c= 'green', alpha=0.7)
plt.title("Score vs value of PCA component 1")
plt.xlabel("PCA component 1")
plt.ylabel("score")
plt.plot()
In [ ]:
# Disabled exploration cells (`if False:`): each one colours the PCA
# projection according to a different grouping of the answers.

# answers whose binarized value for QBBExampleCDS is 1, versus the others
if False:
answered = binarized[binarized[QBBExampleCDS] == 1]
indices = answered.index
surveys = gfdf.iloc[indices].index
classifyAndPlot(gfdf.index, Y, ['guessed', 'did not'], [surveys]);
In [ ]:
# grouping returned by getSurveysOfBiologists, versus the others
if False:
classifyAndPlot(gfdf.index, Y, ['biologist', 'non-biologist'], [getSurveysOfBiologists(gfdf, False).index], title = 'biologists and non-biologists');
In [ ]:
# grouping returned by getSurveysOfGamers, versus the others
if False:
classifyAndPlot(gfdf.index, Y, ['gamer', 'non-gamer'], [getSurveysOfGamers(gfdf, True).index], title = 'gamers and non-gamers');
In [ ]:
# one class per answer to QInterestBiology; value_counts() drops missing
# values here, hence the extra 'other' class name
if False:
classNames = []
classes = []
for answer in gfdf[QInterestBiology].value_counts().index:
classNames.append(answer)
classes.append(gfdf[gfdf[QInterestBiology] == answer].index)
classNames.append('other')
classifyAndPlot(gfdf.index, Y, classNames, classes, rainbow = True, title = 'interest in biology');
In [ ]:
#np.plot(score)
In [ ]:
# distinct score values and current class names
if False:
np.unique(score),classNames
In [ ]:
# Colours the PCA projection by score: one class per distinct score value,
# each class holding the positions of the answers that obtained that score.
if True:
    distinctScores = np.unique(score)
    classNames = [str(thisScore) for thisScore in distinctScores]
    classes = [np.flatnonzero(score == thisScore) for thisScore in distinctScores]
    thesePlots = classifyAndPlot(gfdf.index, Y, classNames, classes, rainbow = True, title = 'score')
In [ ]:
# Disabled exploration cell: colours the PCA projection by the answer given
# to QAge in the pretests, one class per distinct answer in sorted order.
# Answers that are not pretests fall into the default class of classifyAndPlot.
if False:
classNames = []
classes = []
question = QAge
pretests = gfdf[gfdf[QTemporality] == answerTemporalities[0]]
for answer in np.sort(pretests[question].unique()):
classNames.append(str(answer))
classes.append(pretests[pretests[question] == answer].index)
classifyAndPlot(gfdf.index, Y, classNames, classes, rainbow = True, title = 'age');
In [ ]:
eig_vals
In [ ]:
eig_vecs[0]
In [ ]:
maxComponentIndex = np.argmax(abs(eig_vecs[0]))
binarized.columns[maxComponentIndex]
In [ ]:
sum(eig_vecs[0]*eig_vecs[0])
eig_vecs[0]
In [ ]:
# Questions ranked by decreasing absolute weight in the first principal
# component. The component is read from eig_pairs (sorted by decreasing
# eigenvalue): eig_vecs[0] would be a ROW of the eigenvector matrix, whereas
# np.linalg.eig stores the eigenvectors as columns, in no particular order.
componentWeights = np.abs(np.array(eig_pairs[0][1]))
# argsort yields every position exactly once, even when several weights are
# equal (looking each weight up by value would repeat the first match)
sortedIndices = list(np.argsort(componentWeights)[::-1])
descendingWeights = componentWeights[sortedIndices]
sortedQuestions0 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions0
In [ ]:
def accessFirst(a):
    """Return the element of a found at index 0."""
    firstElement = a[0]
    return firstElement
# For several matplotlib colormaps, count how many distinct values the first
# colour channel takes when the colormap is sampled at
# sortedQuestionsLastIndex + 1 evenly spaced positions.
# The displayed tuple starts with the number of sampled positions, followed
# by one count per colormap, in the order of colormapNames.
sortedQuestionsLastIndex = 10
array1 = np.arange(sortedQuestionsLastIndex+1.)/(sortedQuestionsLastIndex + 1.)
import matplotlib.cm as cm
colormapNames = ('Accent', 'Dark2', 'Paired', 'Pastel1', 'Pastel2', 'Set1', 'Set2', 'Set3')
(sortedQuestionsLastIndex+1,) + tuple(
    len(np.unique(np.apply_along_axis(accessFirst, 1, getattr(cm, colormapName)(array1))))
    for colormapName in colormapNames
)
In [ ]:
from matplotlib import cm
def displayQuestionsContributions(
    sortedQuestions,
    title = "Contributions of questions to component",
    sortedQuestionsLastIndex = 10
    ):
    """Draw a pie chart of the weights of the questions.

    sortedQuestions: DataFrame whose index holds the weights and whose
        values hold the matching question names.
    title: title of the chart.
    sortedQuestionsLastIndex: number of leading questions that get their own
        slice; the weights of all the following ones are summed into a
        single slice labelled 'others'.

    Also sets the 'patch.linewidth' and 'text.color' matplotlib rcParams.
    """
    # one colour per slice, 'others' included
    sliceCount = sortedQuestionsLastIndex + 1.
    colors = cm.Set3(np.arange(sliceCount) / sliceCount)

    leadingLabels = sortedQuestions.values.flatten()[:sortedQuestionsLastIndex]
    leadingWeights = sortedQuestions.index[:sortedQuestionsLastIndex]
    remainingWeight = sum(sortedQuestions.index[sortedQuestionsLastIndex:])

    sortedQuestionsLabelsArray = np.append(leadingLabels, 'others')
    sortedQuestionsValuesArray = np.append(leadingWeights, remainingWeight)

    fig1, ax1 = plt.subplots()
    ax1.pie(
        sortedQuestionsValuesArray,
        labels=sortedQuestionsLabelsArray,
        autopct='%1.1f%%',
        startangle=100,
        colors = colors,
    )
    ax1.axis('equal')

    # cf https://matplotlib.org/users/customizing.html
    plt.rcParams['patch.linewidth'] = 0
    plt.rcParams['text.color'] = '#2b2b2b'

    plt.title(title)
    plt.tight_layout()
    plt.show()
In [ ]:
# pie chart of the 10 first questions of sortedQuestions0, the others being
# grouped in a single slice
displayQuestionsContributions(sortedQuestions0, sortedQuestionsLastIndex = 10, title = 'Contributions of questions to component 1')
In [ ]:
# sum of the squared weights stored in the index of sortedQuestions0
sum(sortedQuestions0.index**2)
In [ ]:
# Questions ranked by decreasing absolute weight in the second principal
# component. The component is read from eig_pairs (sorted by decreasing
# eigenvalue): eig_vecs[1] would be a ROW of the eigenvector matrix, whereas
# np.linalg.eig stores the eigenvectors as columns, in no particular order.
componentWeights = np.abs(np.array(eig_pairs[1][1]))
# argsort yields every position exactly once, even when several weights are
# equal (looking each weight up by value would repeat the first match)
sortedIndices = list(np.argsort(componentWeights)[::-1])
descendingWeights = componentWeights[sortedIndices]
sortedQuestions1 = pd.DataFrame(index = descendingWeights, data = binarized.columns[sortedIndices])
sortedQuestions1
In [ ]:
# pie chart of the 10 first questions of sortedQuestions1, the others being
# grouped in a single slice
displayQuestionsContributions(sortedQuestions1, sortedQuestionsLastIndex = 10, title = 'Contributions of questions to component 2')
In [ ]:
# sum of the squared weights stored in the index of sortedQuestions1
sum(sortedQuestions1.index**2)